Weekly Metrics and Time Series Clustering¶

  • In this notebook, code is made available that visualises weekly metrics by region (with hover annotations)
  • Weekly metrics are:
    1. Confirmed Cases (Cases)
    2. Cases per 100K capita
    3. Covid Deaths
    4. Covid Deaths per 100K capita
    5. Cases per Test
    6. Covid Deaths per Test
    7. Covid Deaths per Case
  • Key data availability callouts:
    • PL data (sourced fully from MichaÅ‚ Rogalski's Google Sheet - all metrics available
    • SE data (sourced from Martin BeneÅ¡' covid19sweden package - first four metrics available (test data not available)
    • CZ data (sourced from Martin BeneÅ¡' covid19czechia package - only deaths data extracted here (as of now)
    • In terms of dates:
      • PL tests available from w/o May 11, 2020 to date
      • PL cases/deaths data available from Mar 4, 2020 to date (mature data only available from Mon Mar 9, 2020)
      • SE cases/deaths data available from Dec 30, 2019 to Mar 15, 2021 (mature data only available till Sun Mar 14, 2021)
      • CZ deaths data available from Mar 16, 2020 to Mar 8, 2021 (mature data only available till Sun Mar 7, 2021)
  • Decision made to only use cases/deaths data from Mar 16, 2020 to Mar 7, 2021 (lowest common denominator)
  • Plan is to run DTW on all the said regional timeseries and cluster them with and without DTW and finally compare them with clusters previously created that use only population and population density metrics
    • Work to commence second half of w/o Apr 1st
In [1]:
import covid19czechia as CZ
import covid19sweden as SE
import numpy as np
import plotly.express as px
import pandas as pd
from datetime import datetime, date, timedelta
In [2]:
# utility function
def week_num_to_date(year, week_num):
    if year == 2020:
        return(datetime.strptime(str(year) + str(week_num) + '-1', "%Y%W-%w") - timedelta(days = 7))
    elif year == 2021:
        return(datetime.strptime(str(year) + str(week_num) + '-1', "%Y%W-%w"))
In [3]:
# aggregate PL data
#file_name = './data/pl_google_sheet/' + datetime.today().strftime('%Y%m%d') + '.xlsx'
file_name = './data/pl_google_sheet/' + '20210317' + '.xlsx'

pl_cases = pd.read_excel(io = file_name, 
                  sheet_name='Wzrost w województwach', 
                  dtype=object, 
                  engine='openpyxl', 
                  skiprows=30, 
                  nrows=16)
pl_value_vars = list(pl_cases.columns)[1:-2]
pl_cases = pd.melt(frame = pl_cases.iloc[:, 0:-2], 
            id_vars='Województwo', 
            value_vars=pl_value_vars, 
            var_name='date', 
            value_name='cases')
pl_cases['year'] = pl_cases['date'].apply(lambda x: int(x.year))
pl_cases['week'] = pl_cases['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_cases.loc[(pl_cases['week'] == 53) & (pl_cases['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
pl_cases = pl_cases.groupby(['year', 'week', 'Województwo']).aggregate({'cases': 'sum'}).reset_index()
pl_cases['date'] = pl_cases.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)

pl_deaths = pd.read_excel(io = file_name, 
                  sheet_name='Wzrost w województwach', 
                  dtype=object, 
                  engine='openpyxl', 
                  skiprows=70, 
                  nrows=16)
pl_deaths = pd.melt(frame = pl_deaths.iloc[:, 0:-2], 
            id_vars='Województwo', 
            value_vars=pl_value_vars, 
            var_name='date', 
            value_name='deaths')
pl_deaths['year'] = pl_deaths['date'].apply(lambda x: int(x.year))
pl_deaths['week'] = pl_deaths['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_deaths.loc[(pl_deaths['week'] == 53) & (pl_deaths['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
pl_deaths = pl_deaths.groupby(['year', 'week', 'Województwo']).aggregate({'deaths': 'sum'}).reset_index()
pl_deaths['date'] = pl_deaths.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)

pl_tests_new = pd.read_excel(io = file_name, 
                  sheet_name='Testy w województwach', 
                  dtype=object, 
                  engine='openpyxl', 
                  skiprows=3, 
                  nrows=16)
pl_tests_new = pd.melt(frame = pl_tests_new.iloc[:, 1:-2], 
            id_vars='Województwo', 
            value_vars=list(pl_tests_new.columns)[2:-2], 
            var_name='date', 
            value_name='tests')
pl_tests_new = pl_tests_new[pl_tests_new['date'] >= '2020-12-28'] # only pick up data from 28 Dec 2020 or later
pl_tests_new['year'] = pl_tests_new['date'].apply(lambda x: int(x.year))
pl_tests_new['week'] = pl_tests_new['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_tests_new.loc[(pl_tests_new['week'] == 53) & (pl_tests_new['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
pl_tests_new = pl_tests_new.groupby(['year', 'week', 'Województwo']).aggregate({'tests': 'sum'}).reset_index()
pl_tests_new['date'] = pl_tests_new.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)

pl_wkly_tests_old = pd.read_excel(io = file_name, 
                      sheet_name=' Testy w województwach od 11.05', # sheet name appears to have changed to 'Testy w województwach od 11.05 do 28.12.2020'
                      dtype=object, 
                      engine='openpyxl', 
                      skiprows=2, 
                      nrows=16, 
                      verbose=False, 
                      parse_dates=False, 
                      date_parser=None)
pl_wkly_tests_old.iloc[12, 0] = 'Świętokrzyskie'
pl_wkly_tests_old = pd.melt(frame = pl_wkly_tests_old.iloc[:, 0:-5], 
                        id_vars='Województwo', 
                        value_vars=list(pl_wkly_tests_old.columns)[1:-5], 
                        var_name='date', 
                        value_name='tests')
pl_wkly_tests_old['year'] = pl_wkly_tests_old['date'].apply(lambda x: int(x.year))
pl_wkly_tests_old['week'] = pl_wkly_tests_old['date'].apply(lambda x: int(x.isocalendar()[1]))
pl_wkly_tests_old = pl_wkly_tests_old[pl_wkly_tests_old['date'] < '2020-12-28'] # only pick up data from before 28 Dec 2020

pl_data = pd.merge(pd.merge(pl_cases, pl_deaths, on = ['Województwo', 'date', 'year', 'week'], how = 'left'),
    pd.concat([pl_tests_new, pl_wkly_tests_old]), on = ['Województwo', 'date', 'year', 'week'], how = 'left') \
    .rename({'Województwo':'name'}, axis = 1)

# aggregate SE data
se_data = SE.covid_deaths()
se_data = se_data.groupby(['year', 'week', 'region']) \
    .aggregate({'deaths':'sum', 'confirmed':'sum'}).reset_index()
se_data['date'] = se_data.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
se_data = se_data.rename({'confirmed':'cases'}, axis = 1)

# aggregate CZ data
cz_data = CZ.covid_deaths(level = 2)
cz_data['year'] = cz_data['date'].apply(lambda x: x.year)
cz_data.loc[(cz_data['week'] == 53) & (cz_data['date'].apply(lambda x:x.month) == 1), 'year'] -= 1
cz_data = cz_data.groupby(['year', 'week', 'region']).aggregate({'deaths': 'sum'}).reset_index()
cz_data['date'] = cz_data.apply(lambda r: week_num_to_date(r.year, r.week), axis = 1)
In [4]:
# set up the regions dataset
regions = pd.read_csv("./data/regions.csv")
regions.loc[regions['NUTS3'].isin(['SE214', 'SE322', 'SE221', 'SE212', 'SE213', 'SE321', \
                                   'SE332', 'SE331', 'SE312', 'SE311', 'SE313', 'SE124', \
                                   'SE122', 'SE125', 'CZ041', 'SE231', 'SE211', 'SE121']), 'cluster_1'] = 1

regions.loc[regions['NUTS3'].isin(['SE123', 'CZ051', 'CZ063', 'CZ053', 'CZ052', 'CZ032', \
                                   'CZ031', 'CZ072', 'CZ071']), 'cluster_1'] = 2

regions.loc[regions['NUTS3'].isin(['CZ042', 'PL52', 'PL43', 'PL84', 'CZ064', 'PL72', \
                                   'CZ080', 'CZ020', 'SE224', 'PL62', 'PL42', 'SE232']), 'cluster_1'] = 3

regions.loc[regions['NUTS3'].isin(['PL61', 'PL82', 'PL81', 'PL63', 'PL71', 'SE110']), 'cluster_1'] = 4

regions.loc[regions['NUTS3'].isin(['PL51', 'PL21', 'PL41', 'PL22', 'PL9']), 'cluster_1'] = 5

regions.loc[regions['NUTS3'] == 'CZ010', 'cluster_1'] = 6

regions['cluster_2'] = regions['cluster_1'] - 1
regions.loc[regions['cluster_2'] == 0, 'cluster_2'] = 1

regions = regions.rename(columns = {'NUTS3':'region'})
In [5]:
# cross reference the PL/CZ/SE datasets with info in the regions file
pl_data = pd.merge(regions, pl_data, on='name', how="inner")
cz_data = pd.merge(regions, cz_data, on='region', how="inner")
se_data = pd.merge(regions, se_data, on='region', how="inner")
all_data = pd.concat([pl_data, cz_data, se_data])
In [6]:
# fix datatype of 'tests' column (handle NaN)
all_data['tests'] = all_data['tests'].astype('float')

# compute deaths and cases per 100K capita, deaths per test, cases per test
all_data['cases_100K'] = all_data['cases']/all_data['population']*100000
all_data['deaths_100K'] = all_data['deaths']/all_data['population']*100000
all_data['cases_per_test'] = all_data['cases']/all_data['tests']
all_data['deaths_per_test'] = all_data['deaths']/all_data['tests']
all_data['deaths_per_case'] = all_data['deaths']/all_data['cases']

# concatenate NUTS code and region name
all_data['region'] = all_data['region'] + ' - ' + all_data['name']

# exclude immature data
all_data = all_data[all_data['date'] <= '2021-03-01'] # w/o Mar 7, 2021
all_data = all_data[all_data['date'] >= '2020-03-16']
In [7]:
# visualise trends in all regions
fig = px.line(all_data[pd.notnull(all_data["cases"])], x="date", y="cases", color="region",
             title="Weekly Cases by Region")
fig.show()

fig = px.line(all_data[pd.notnull(all_data["deaths"])], x="date", y="deaths", color="region",
             title="Weekly Deaths by Region")
fig.show()

fig = px.line(all_data[pd.notnull(all_data["cases_100K"])], x="date", y="cases_100K", color="region",
             title="Weekly Cases per 100K Capita by Region")
fig.show()

fig = px.line(all_data[pd.notnull(all_data["deaths_100K"])], x="date", y="deaths_100K", color="region",
             title="Weekly Deaths per 100K Capita by Region")
fig.show()

fig = px.line(all_data[pd.notnull(all_data["tests"])], x="date", y="tests", color="region",
             title="Weekly Tests by Region")
fig.show()

fig = px.line(all_data[pd.notnull(all_data["cases_per_test"])], x="date", y="cases_per_test", color="region",
             title="Weekly Cases per Test by Region")
fig.show()

fig = px.line(all_data[pd.notnull(all_data["deaths_per_test"])], x="date", y="deaths_per_test", color="region",
             title="Weekly Deaths per Test by Region")
fig.show()

fig = px.line(all_data[pd.notnull(all_data["deaths_per_case"])], x="date", y="deaths_per_case", color="region",
             title="Weekly Deaths per Case by Region")
fig.show()